Gupta normalised dataset (http://www.arkinglab.org/resources/)

Note: Raw data can be found in the National Database for Autism Research (NDAR) under the accession code NDARCOL0002034 but you need to request access

Load data

Note: The rownames in the expression dataset have the following structure: ensemble ID + ‘.’ + gene name + n. I have to find what the n stands for (just different measurements of the gene? polymorphisms? other?)

# LOAD METADATA
datMeta =  read.delim('./../Data/Gupta/AUT_pheno.txt')
datMeta = datMeta %>% dplyr::select(-matches('PC|SV|MEDIAN|.01'))
datMeta$age_group = cut(datMeta$Age, c(-0.5, 0, 0.6, 10, 20, 30, 40, 50, 60, 70, 80),
                    labels=c('Fetal','Infant','Child','10-20','20s','30s','40s','50s','60s','70s'))


# LOAD EXPRESSION DATA
datExpr = read.delim('./../Data/Gupta/NormalizedGeneExpression_AUT.txt')


# ANNOTATE GENES
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                 values=sub('\\..*', '', rownames(datExpr)), mart=mart)
datGenes = datGenes[match(sub('\\..*', '', rownames(datExpr)), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position
datGenes$ID_match_datExpr = paste(datGenes$ensembl_gene_id, gsub('-','.',datGenes$external_gene_id), sep='.')

# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv')


# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID' = as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal' = 1)


# Combine SFARI and GO information
gene_info = data.frame('ID'=sub('\\..*', '', rownames(datExpr))) %>% left_join(SFARI_genes, by='ID') %>% 
            mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
            left_join(GO_neuronal, by='ID') %>% 
            mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
            mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))


rm(getinfo, mart, GO_annotations)


Check sample distribution

RNA-Seq for 104 cortical brain-tissue samples across three brain regions (BA10, BA19 and BA44/45), comprising 57 samples from 40 control subjects and 47 samples from 32 ASD subjects

print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$sampleid)), ' different subjects.'))
## [1] "Dataset includes 48216 genes from 104 samples belonging to 72 different subjects."


Diagnosis distribution: Fairly balanced

table(datMeta$Dx)
## 
##  Autism Control 
##      47      57


Brain region distribution: Most samples (~60%) belong to Brodmann area 19

Where: 1=BA10, 2=BA19 and 3=BA44/45

table(datMeta$brainregion)
## 
##  1  2  3 
## 14 62 28


Diagnosis and brain region are balanced, although not that much

table(datMeta$Dx, datMeta$brainregion)
##          
##            1  2  3
##   Autism   6 24 17
##   Control  8 38 11


Sex distribution: There are three times more Male samples than Female ones

table(datMeta$Gender)
## 
##  F  M 
## 24 80


Age distribution: Subjects between 2 and 82 years old with a mean close to 20

summary(datMeta$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    9.00   18.00   20.81   23.25   82.00



Filtering

Note: There are some NAs in the expression data, but before diciding which genes/samples to remove it would be better to remove low expressed genes (maybe they have many of the NAs)

print(paste0('There are ', sum(is.na(datExpr)), ' NAs in the expression data'))
## [1] "There are 18060 NAs in the expression data"
  1. Filter genes with low expression levels

Seems like there is a really dense concentration of point with mean expression ~ -1 and sd ~ 1.3

mean_expr = data.frame('ID'=rownames(datExpr),'Mean'=rowMeans(datExpr, na.rm=T),
                       'SD'=apply(datExpr,1,function(x) sd(x, na.rm=T)))

p1 = mean_expr %>% ggplot(aes(Mean, SD)) + geom_point(alpha=0.01, color='#0099cc') + geom_smooth(method='lm', color='gray') +
     geom_vline(xintercept=-2.8, color='gray', linetype='dashed') + theme_minimal()

p2 = mean_expr %>% ggplot(aes(Mean)) + geom_density(fill='#0099cc', color='#0099cc', alpha=0.5) + 
     geom_vline(xintercept=-2.8, color='gray', linetype='dashed') + theme_minimal()

grid.arrange(p1, p2, ncol=2)

rm(mean_expr, p1, p2)

Filtering out genes with a mean expression lower than -2.8

to_keep = rowMeans(datExpr, na.rm=TRUE)>-2.8
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep, na.rm=T), ', ', sum(to_keep, na.rm=T), ' remaining'))
## [1] "Removed 1314, 46902 remaining"
rm(to_keep)


  1. Remove NAs in the data
missing_points = is.na(datExpr)

print(paste0('There are ', sum(is.na(datExpr)), ' NAs in the expression data'))
## [1] "There are 17805 NAs in the expression data"

\(\qquad\) 2.1 Filtering samples with the highest number of missing values (chose to remove the top 2 samples, which have over 8 times the average number of missing values per sample)

nas_by_sample = data.frame('ID' = colnames(datExpr),
                           'NAs' = datExpr %>% apply(2, function(x) x %>% is.na %>% sum))

ggplotly(nas_by_sample %>% ggplot(aes(reorder(ID,-NAs), NAs, fill=NAs)) + geom_bar(stat='identity') + 
         xlab('Sample') + ylab('NA count') + scale_fill_viridis() + theme_minimal() + 
         geom_hline(yintercept=sum(is.na(datExpr))/ncol(datExpr), color='gray') + ggtitle('Missing values by sample') +
         theme(axis.text.x = element_blank(), axis.ticks = element_blank(), legend.position='none'))
print(paste0('Removing samples ', paste(nas_by_sample$ID[nas_by_sample$NAs>1300], collapse = ' and ')))
## [1] "Removing samples ba19.s31 and ba19.s40"
to_keep = which(!colnames(datExpr) %in% nas_by_sample$ID[nas_by_sample$NAs>1300])
datExpr = datExpr[,to_keep]
datMeta = datMeta[to_keep,]

print(paste0('Removed ', length(!to_keep), ' samples, ', length(to_keep), ' remaining'))
## [1] "Removed 102 samples, 102 remaining"
rm(nas_by_sample, to_keep)


\(\qquad\) 2.2 Filter genes with the highest number of missing values (chose to filter the genes with more than 3% missing values (3 or more))

nas_by_gene =  data.frame('ID' = rownames(datExpr),
                           'NAs' = datExpr %>% apply(1, function(x) x %>% is.na %>% sum))

nas_by_gene %>% ggplot(aes(NAs)) + geom_bar(fill='#0099cc') + scale_y_sqrt() + theme_minimal()

to_keep = nas_by_gene$NAs<3
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## [1] "Removed 356 genes, 46546 remaining"
rm(nas_by_gene)

\(\qquad\) 2.3 Impute missing values using the mice (Multiple Imputations by Chained Equations) package

  1. Remove all samples with more than 550 missing values
to_keep = apply(datExpr, 2, function(x) sum(is.na(x)))<550
datExpr = datExpr[,to_keep]
datMeta = datMeta[to_keep,]

print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 6 samples, 96 remaining"
  1. Remove all genes with missing values
to_keep = apply(datExpr, 1, function(x) sum(is.na(x)))==0
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## [1] "Removed 8287 genes, 38259 remaining"


  1. Filter outlier samples

Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'Age'=datMeta$age_group)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Age)) + geom_point() + geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: ba10.s23, ba19.s36, ba19.s7"
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 3 samples, 93 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 38259 genes and 93 samples"



Differential Expression Analysis

Batch characterisation

SiteHM

Site HM indicates the laboratory the samples belong to (Harvard/Maryland). Diagnosis groups are not balanced between sites!!! Using ComBat for batch correction could erase biological signals related to the diagnosis

table(datMeta$SiteHM, datMeta$Dx)
##    
##     Autism Control
##   H     30       9
##   M     11      43

Looking for unknown sources of batch effects with sva

Can’t use svaseq because data is not log counts, so using sva instead

mod = model.matrix(~ Dx, datMeta)
mod0 = model.matrix(~ 1, datMeta)
sva_fit = datExpr %>% as.matrix %>% sva(mod=mod, mod0=mod0)
## Number of significant surrogate variables is:  5 
## Iteration (out of 5 ):1  2  3  4  5
rm(mod, mod0)

Include SV estimations to datMeta information

sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV',1:ncol(sv_data))

datMeta = cbind(datMeta, sv_data)

rm(sv_data)



Differential Expression Analysis

Using lmFit

mod = model.matrix(~ SV1 + SV2 + SV3 + SV4 + SV5 + Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$sampleid)
lmfit = lmFit(datExpr, design=mod, block=datMeta$sampleid, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr)) %>% mutate('ID'=sub('\\..*', '', rownames(.)))

DE_info = gene_info %>% left_join(top_genes, by='ID')

rm(mod, corfit, lmfit, fit, top_genes)



Visualisations


Samples

PCA: There doesn’t seem to be a specific pattern related to gender or age, there does seem to be a small differentiation by diagnosis in the 2nd principal component

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            left_join(datMeta, by='ID') %>% 
            dplyr::select('PC1','PC2','Age','age_group','Gender','Dx') %>%
            mutate('sqrt_age' = sqrt(Age))

selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
rm(pca, plot_data)


Genes

  • First Principal Component explains 92% of the total variance

  • There’s a really strong (negative) correlation between the mean expression of a gene and the 1st principal component

  • It seems like the lower expressed points that make a weird turn on the PC2 axis have a smaller PC3 value than the rest (rotate PC1,PC2 axis and it becomes visible). Haven’t found what else characterises them

pca = datExpr %>% prcomp

plot_data = data.frame('PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3], 
                       'MeanExpr'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.2) + theme_minimal() + 
     scale_color_viridis() + ggtitle('PCA') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

plot_data %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_data$MeanExpr, size=1) %>%
    layout(title = 'PCA 3D',
           scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca)$importance[2,1]*100,2),'%)')),
                        yaxis=list(title=glue('PC2 (',round(summary(pca)$importance[2,2]*100,2),'%)')),
                        zaxis=list(title=glue('PC3 (',round(summary(pca)$importance[2,3]*100,2),'%)'))))
rm(pca, plot_data)



Mean and SD by SFARI score

The higher the SFARI score the higher the mean expression but the lower the standard deviation. Same as Gandal’s dataset!

plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr)),
                       'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(gene_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)




SFARI scores

Mean and SD

The higher the SFARI score the higher the mean expression but the lower the standard deviation. Same as Gandal’s dataset!

plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr)),
                       'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(DE_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)


Log Fold Change

From scores 1 through 4, the higher the SFARI score, the lower the log Fold Change. Same behaviour as Gandal!

ggplotly(DE_info %>% ggplot(aes(x=gene.score, y=abs(logFC), fill=gene.score)) + 
         geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal() + theme(legend.position='none'))
## Warning: Removed 9957 rows containing non-finite values (stat_boxplot).




Save preprocessed dataset

save(datExpr, datMeta, datGenes, DE_info, file='./../Data/Gupta/preprocessed_data.RData')
#load('./../Data/Gupta/preprocessed_data.RData')



Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] glue_1.3.1             sva_3.30.1             BiocParallel_1.16.6   
##  [4] genefilter_1.64.0      mgcv_1.8-26            nlme_3.1-137          
##  [7] edgeR_3.24.3           limma_3.38.3           Rtsne_0.15            
## [10] vsn_3.50.0             Biobase_2.42.0         BiocGenerics_0.28.0   
## [13] WGCNA_1.66             fastcluster_1.1.25     dynamicTreeCut_1.63-1 
## [16] forcats_0.3.0          stringr_1.4.0          dplyr_0.8.0.1         
## [19] purrr_0.3.1            readr_1.3.1            tidyr_0.8.3           
## [22] tibble_2.1.1           tidyverse_1.2.1        viridis_0.5.1         
## [25] viridisLite_0.3.0      gridExtra_2.3          plotlyutils_0.0.0.9000
## [28] plotly_4.8.0           ggplot2_3.1.0          biomaRt_2.38.0        
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1      htmlTable_1.11.2      base64enc_0.1-3      
##   [4] rstudioapi_0.7        affyio_1.52.0         bit64_0.9-7          
##   [7] AnnotationDbi_1.44.0  mvtnorm_1.0-7         lubridate_1.7.4      
##  [10] xml2_1.2.0            codetools_0.2-15      splines_3.5.2        
##  [13] doParallel_1.0.14     impute_1.56.0         robustbase_0.93-0    
##  [16] knitr_1.22            Formula_1.2-3         jsonlite_1.6         
##  [19] Cairo_1.5-9           annotate_1.60.1       broom_0.5.1          
##  [22] cluster_2.0.7-1       GO.db_3.7.0           shiny_1.2.0          
##  [25] BiocManager_1.30.4    rrcov_1.4-3           compiler_3.5.2       
##  [28] httr_1.4.0            backports_1.1.2       assertthat_0.2.1     
##  [31] Matrix_1.2-15         lazyeval_0.2.2        cli_1.1.0            
##  [34] later_0.8.0           acepack_1.4.1         htmltools_0.3.6      
##  [37] prettyunits_1.0.2     tools_3.5.2           gtable_0.2.0         
##  [40] affy_1.60.0           Rcpp_1.0.1            cellranger_1.1.0     
##  [43] preprocessCore_1.44.0 crosstalk_1.0.0       iterators_1.0.9      
##  [46] xfun_0.5              rvest_0.3.2           mime_0.6             
##  [49] statmod_1.4.30        XML_3.98-1.11         DEoptimR_1.0-8       
##  [52] MASS_7.3-51.1         zlibbioc_1.28.0       scales_1.0.0         
##  [55] promises_1.0.1        hms_0.4.2             RColorBrewer_1.1-2   
##  [58] curl_3.3              yaml_2.2.0            memoise_1.1.0        
##  [61] rpart_4.1-13          latticeExtra_0.6-28   stringi_1.4.3        
##  [64] RSQLite_2.1.1         S4Vectors_0.20.1      pcaPP_1.9-73         
##  [67] foreach_1.4.4         checkmate_1.8.5       rlang_0.3.2          
##  [70] pkgconfig_2.0.2       matrixStats_0.54.0    bitops_1.0-6         
##  [73] evaluate_0.13         lattice_0.20-38       labeling_0.3         
##  [76] htmlwidgets_1.3       bit_1.1-14            tidyselect_0.2.5     
##  [79] robust_0.4-18         plyr_1.8.4            magrittr_1.5         
##  [82] R6_2.4.0              IRanges_2.16.0        generics_0.0.2       
##  [85] Hmisc_4.1-1           fit.models_0.5-14     DBI_1.0.0            
##  [88] pillar_1.3.1          haven_1.1.1           foreign_0.8-71       
##  [91] withr_2.1.2           survival_2.43-3       RCurl_1.95-4.10      
##  [94] nnet_7.3-12           modelr_0.1.4          crayon_1.3.4         
##  [97] rmarkdown_1.12        progress_1.2.0        locfit_1.5-9.1       
## [100] grid_3.5.2            readxl_1.1.0          data.table_1.12.0    
## [103] blob_1.1.1            digest_0.6.18         xtable_1.8-3         
## [106] httpuv_1.5.0          stats4_3.5.2          munsell_0.5.0